IMPORTING LIBRARIES & DATA¶
In [5]:
import warnings
warnings.filterwarnings('ignore')
In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [8]:
pd.set_option('display.max_columns', None)
In [9]:
app_data = pd.read_csv('application_data.csv.zip')
prev_data = pd.read_csv('previous_application.csv.zip')
In [10]:
print("Application Data Shape:", app_data.shape)
print("Previous Application Data Shape:", prev_data.shape)
Application Data Shape: (307511, 122) Previous Application Data Shape: (1670214, 37)
In [11]:
print(app_data.info())
print(app_data.head())
print(app_data.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR \
0 100002 1 Cash loans M N
1 100003 0 Cash loans F N
2 100004 0 Revolving loans M Y
3 100006 0 Cash loans F N
4 100007 0 Cash loans M N
FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY \
0 Y 0 202500.0 406597.5 24700.5
1 N 0 270000.0 1293502.5 35698.5
2 Y 0 67500.0 135000.0 6750.0
3 Y 0 135000.0 312682.5 29686.5
4 Y 0 121500.0 513000.0 21865.5
AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE \
0 351000.0 Unaccompanied Working
1 1129500.0 Family State servant
2 135000.0 Unaccompanied Working
3 297000.0 Unaccompanied Working
4 513000.0 Unaccompanied Working
NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE \
0 Secondary / secondary special Single / not married House / apartment
1 Higher education Married House / apartment
2 Secondary / secondary special Single / not married House / apartment
3 Secondary / secondary special Civil marriage House / apartment
4 Secondary / secondary special Single / not married House / apartment
REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION \
0 0.018801 -9461 -637 -3648.0
1 0.003541 -16765 -1188 -1186.0
2 0.010032 -19046 -225 -4260.0
3 0.008019 -19005 -3039 -9833.0
4 0.028663 -19932 -3038 -4311.0
DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE \
0 -2120 NaN 1 1 0
1 -291 NaN 1 1 0
2 -2531 26.0 1 1 1
3 -2437 NaN 1 1 0
4 -3458 NaN 1 1 0
FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS \
0 1 1 0 Laborers 1.0
1 1 1 0 Core staff 2.0
2 1 1 0 Laborers 1.0
3 1 0 0 Laborers 2.0
4 1 0 0 Core staff 1.0
REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY \
0 2 2
1 1 1
2 2 2
3 2 2
4 2 2
WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START \
0 WEDNESDAY 10
1 MONDAY 11
2 MONDAY 9
3 WEDNESDAY 17
4 THURSDAY 11
REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION \
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY \
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE \
0 0 0 Business Entity Type 3
1 0 0 School
2 0 0 Government
3 0 0 Business Entity Type 3
4 1 1 Religion
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG \
0 0.083037 0.262949 0.139376 0.0247 0.0369
1 0.311267 0.622246 NaN 0.0959 0.0529
2 NaN 0.555912 0.729567 NaN NaN
3 NaN 0.650442 NaN NaN NaN
4 NaN 0.322738 NaN NaN NaN
YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG \
0 0.9722 0.6192 0.0143
1 0.9851 0.7960 0.0605
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG \
0 0.00 0.0690 0.0833 0.1250 0.0369
1 0.08 0.0345 0.2917 0.3333 0.0130
2 NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN
LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG \
0 0.0202 0.0190 0.0000
1 0.0773 0.0549 0.0039
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE \
0 0.0000 0.0252 0.0383
1 0.0098 0.0924 0.0538
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE \
0 0.9722 0.6341 0.0144
1 0.9851 0.8040 0.0497
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE \
0 0.0000 0.0690 0.0833 0.1250
1 0.0806 0.0345 0.2917 0.3333
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE \
0 0.0377 0.022 0.0198
1 0.0128 0.079 0.0554
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI \
0 0.0 0.0 0.0250
1 0.0 0.0 0.0968
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI \
0 0.0369 0.9722 0.6243
1 0.0529 0.9851 0.7987
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI \
0 0.0144 0.00 0.0690 0.0833
1 0.0608 0.08 0.0345 0.2917
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI \
0 0.1250 0.0375 0.0205 0.0193
1 0.3333 0.0132 0.0787 0.0558
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE \
0 0.0000 0.00 reg oper account
1 0.0039 0.01 reg oper account
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE \
0 block of flats 0.0149 Stone, brick No
1 block of flats 0.0714 Block No
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE \
0 2.0 2.0
1 1.0 0.0
2 0.0 0.0
3 2.0 0.0
4 0.0 0.0
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE \
0 2.0 2.0 -1134.0
1 1.0 0.0 -828.0
2 0.0 0.0 -815.0
3 2.0 0.0 -617.0
4 0.0 0.0 -1106.0
FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 \
0 0 1 0 0
1 0 1 0 0
2 0 0 0 0
3 0 1 0 0
4 0 0 0 0
FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 \
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 1 0
FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 \
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 \
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 \
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY \
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 NaN NaN
4 0.0 0.0
AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON \
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 NaN NaN
4 0.0 0.0
AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 0.0 1.0
1 0.0 0.0
2 0.0 0.0
3 NaN NaN
4 0.0 0.0
SK_ID_CURR TARGET CNT_CHILDREN AMT_INCOME_TOTAL \
count 307511.000000 307511.000000 307511.000000 3.075110e+05
mean 278180.518577 0.080729 0.417052 1.687979e+05
std 102790.175348 0.272419 0.722121 2.371231e+05
min 100002.000000 0.000000 0.000000 2.565000e+04
25% 189145.500000 0.000000 0.000000 1.125000e+05
50% 278202.000000 0.000000 0.000000 1.471500e+05
75% 367142.500000 0.000000 1.000000 2.025000e+05
max 456255.000000 1.000000 19.000000 1.170000e+08
AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE \
count 3.075110e+05 307499.000000 3.072330e+05
mean 5.990260e+05 27108.573909 5.383962e+05
std 4.024908e+05 14493.737315 3.694465e+05
min 4.500000e+04 1615.500000 4.050000e+04
25% 2.700000e+05 16524.000000 2.385000e+05
50% 5.135310e+05 24903.000000 4.500000e+05
75% 8.086500e+05 34596.000000 6.795000e+05
max 4.050000e+06 258025.500000 4.050000e+06
REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED \
count 307511.000000 307511.000000 307511.000000
mean 0.020868 -16036.995067 63815.045904
std 0.013831 4363.988632 141275.766519
min 0.000290 -25229.000000 -17912.000000
25% 0.010006 -19682.000000 -2760.000000
50% 0.018850 -15750.000000 -1213.000000
75% 0.028663 -12413.000000 -289.000000
max 0.072508 -7489.000000 365243.000000
DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL \
count 307511.000000 307511.000000 104582.000000 307511.000000
mean -4986.120328 -2994.202373 12.061091 0.999997
std 3522.886321 1509.450419 11.944812 0.001803
min -24672.000000 -7197.000000 0.000000 0.000000
25% -7479.500000 -4299.000000 5.000000 1.000000
50% -4504.000000 -3254.000000 9.000000 1.000000
75% -2010.000000 -1720.000000 15.000000 1.000000
max 0.000000 0.000000 91.000000 1.000000
FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE \
count 307511.000000 307511.000000 307511.000000 307511.000000
mean 0.819889 0.199368 0.998133 0.281066
std 0.384280 0.399526 0.043164 0.449521
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 1.000000 0.000000 1.000000 0.000000
75% 1.000000 0.000000 1.000000 1.000000
max 1.000000 1.000000 1.000000 1.000000
FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT \
count 307511.000000 307509.000000 307511.000000
mean 0.056720 2.152665 2.052463
std 0.231307 0.910682 0.509034
min 0.000000 1.000000 1.000000
25% 0.000000 2.000000 2.000000
50% 0.000000 2.000000 2.000000
75% 0.000000 3.000000 2.000000
max 1.000000 20.000000 3.000000
REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START \
count 307511.000000 307511.000000
mean 2.031521 12.063419
std 0.502737 3.265832
min 1.000000 0.000000
25% 2.000000 10.000000
50% 2.000000 12.000000
75% 2.000000 14.000000
max 3.000000 23.000000
REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION \
count 307511.000000 307511.000000
mean 0.015144 0.050769
std 0.122126 0.219526
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY \
count 307511.000000 307511.000000
mean 0.040659 0.078173
std 0.197499 0.268444
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 \
count 307511.000000 307511.000000 134133.000000
mean 0.230454 0.179555 0.502130
std 0.421124 0.383817 0.211062
min 0.000000 0.000000 0.014568
25% 0.000000 0.000000 0.334007
50% 0.000000 0.000000 0.505998
75% 0.000000 0.000000 0.675053
max 1.000000 1.000000 0.962693
EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG \
count 3.068510e+05 246546.000000 151450.00000 127568.000000
mean 5.143927e-01 0.510853 0.11744 0.088442
std 1.910602e-01 0.194844 0.10824 0.082438
min 8.170000e-08 0.000527 0.00000 0.000000
25% 3.924574e-01 0.370650 0.05770 0.044200
50% 5.659614e-01 0.535276 0.08760 0.076300
75% 6.636171e-01 0.669057 0.14850 0.112200
max 8.549997e-01 0.896010 1.00000 1.000000
YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG \
count 157504.000000 103023.000000 92646.000000
mean 0.977735 0.752471 0.044621
std 0.059223 0.113280 0.076036
min 0.000000 0.000000 0.000000
25% 0.976700 0.687200 0.007800
50% 0.981600 0.755200 0.021100
75% 0.986600 0.823200 0.051500
max 1.000000 1.000000 1.000000
ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG \
count 143620.000000 152683.000000 154491.000000 98869.000000
mean 0.078942 0.149725 0.226282 0.231894
std 0.134576 0.100049 0.144641 0.161380
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.069000 0.166700 0.083300
50% 0.000000 0.137900 0.166700 0.208300
75% 0.120000 0.206900 0.333300 0.375000
max 1.000000 1.000000 1.000000 1.000000
LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG \
count 124921.000000 97312.000000 153161.000000
mean 0.066333 0.100775 0.107399
std 0.081184 0.092576 0.110565
min 0.000000 0.000000 0.000000
25% 0.018700 0.050400 0.045300
50% 0.048100 0.075600 0.074500
75% 0.085600 0.121000 0.129900
max 1.000000 1.000000 1.000000
NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE \
count 93997.000000 137829.000000 151450.000000
mean 0.008809 0.028358 0.114231
std 0.047732 0.069523 0.107936
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.052500
50% 0.000000 0.003600 0.084000
75% 0.003900 0.027700 0.143900
max 1.000000 1.000000 1.000000
BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE \
count 127568.000000 157504.000000 103023.000000
mean 0.087543 0.977065 0.759637
std 0.084307 0.064575 0.110111
min 0.000000 0.000000 0.000000
25% 0.040700 0.976700 0.699400
50% 0.074600 0.981600 0.764800
75% 0.112400 0.986600 0.823600
max 1.000000 1.000000 1.000000
COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE \
count 92646.000000 143620.000000 152683.000000 154491.000000
mean 0.042553 0.074490 0.145193 0.222315
std 0.074445 0.132256 0.100977 0.143709
min 0.000000 0.000000 0.000000 0.000000
25% 0.007200 0.000000 0.069000 0.166700
50% 0.019000 0.000000 0.137900 0.166700
75% 0.049000 0.120800 0.206900 0.333300
max 1.000000 1.000000 1.000000 1.000000
FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE \
count 98869.000000 124921.000000 97312.000000 153161.000000
mean 0.228058 0.064958 0.105645 0.105975
std 0.161160 0.081750 0.097880 0.111845
min 0.000000 0.000000 0.000000 0.000000
25% 0.083300 0.016600 0.054200 0.042700
50% 0.208300 0.045800 0.077100 0.073100
75% 0.375000 0.084100 0.131300 0.125200
max 1.000000 1.000000 1.000000 1.000000
NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI \
count 93997.000000 137829.000000 151450.000000
mean 0.008076 0.027022 0.117850
std 0.046276 0.070254 0.109076
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.058300
50% 0.000000 0.001100 0.086400
75% 0.003900 0.023100 0.148900
max 1.000000 1.000000 1.000000
BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI \
count 127568.000000 157504.000000 103023.000000
mean 0.087955 0.977752 0.755746
std 0.082179 0.059897 0.112066
min 0.000000 0.000000 0.000000
25% 0.043700 0.976700 0.691400
50% 0.075800 0.981600 0.758500
75% 0.111600 0.986600 0.825600
max 1.000000 1.000000 1.000000
COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI \
count 92646.000000 143620.000000 152683.000000 154491.000000
mean 0.044595 0.078078 0.149213 0.225897
std 0.076144 0.134467 0.100368 0.145067
min 0.000000 0.000000 0.000000 0.000000
25% 0.007900 0.000000 0.069000 0.166700
50% 0.020800 0.000000 0.137900 0.166700
75% 0.051300 0.120000 0.206900 0.333300
max 1.000000 1.000000 1.000000 1.000000
FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI \
count 98869.000000 124921.000000 97312.000000 153161.000000
mean 0.231625 0.067169 0.101954 0.108607
std 0.161934 0.082167 0.093642 0.112260
min 0.000000 0.000000 0.000000 0.000000
25% 0.083300 0.018700 0.051300 0.045700
50% 0.208300 0.048700 0.076100 0.074900
75% 0.375000 0.086800 0.123100 0.130300
max 1.000000 1.000000 1.000000 1.000000
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI TOTALAREA_MODE \
count 93997.000000 137829.000000 159080.000000
mean 0.008651 0.028236 0.102547
std 0.047415 0.070166 0.107462
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.041200
50% 0.000000 0.003100 0.068800
75% 0.003900 0.026600 0.127600
max 1.000000 1.000000 1.000000
OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE \
count 306490.000000 306490.000000
mean 1.422245 0.143421
std 2.400989 0.446698
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 2.000000 0.000000
max 348.000000 34.000000
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE \
count 306490.000000 306490.000000
mean 1.405292 0.100049
std 2.379803 0.362291
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 2.000000 0.000000
max 344.000000 24.000000
DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 \
count 307510.000000 307511.000000 307511.000000
mean -962.858788 0.000042 0.710023
std 826.808487 0.006502 0.453752
min -4292.000000 0.000000 0.000000
25% -1570.000000 0.000000 0.000000
50% -757.000000 0.000000 1.000000
75% -274.000000 0.000000 1.000000
max 0.000000 1.000000 1.000000
FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 \
count 307511.000000 307511.000000 307511.000000 307511.000000
mean 0.000081 0.015115 0.088055 0.000192
std 0.009016 0.122010 0.283376 0.013850
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 \
count 307511.000000 307511.000000 307511.000000 307511.000000
mean 0.081376 0.003896 0.000023 0.003912
std 0.273412 0.062295 0.004771 0.062424
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 \
count 307511.000000 307511.000000 307511.000000 307511.00000
mean 0.000007 0.003525 0.002936 0.00121
std 0.002550 0.059268 0.054110 0.03476
min 0.000000 0.000000 0.000000 0.00000
25% 0.000000 0.000000 0.000000 0.00000
50% 0.000000 0.000000 0.000000 0.00000
75% 0.000000 0.000000 0.000000 0.00000
max 1.000000 1.000000 1.000000 1.00000
FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 \
count 307511.000000 307511.000000 307511.000000 307511.000000
mean 0.009928 0.000267 0.008130 0.000595
std 0.099144 0.016327 0.089798 0.024387
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR \
count 307511.000000 307511.000000 265992.000000
mean 0.000507 0.000335 0.006402
std 0.022518 0.018299 0.083849
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000
max 1.000000 1.000000 4.000000
AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK \
count 265992.000000 265992.000000
mean 0.007000 0.034362
std 0.110757 0.204685
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 9.000000 8.000000
AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT \
count 265992.000000 265992.000000
mean 0.267395 0.265474
std 0.916002 0.794056
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 27.000000 261.000000
AMT_REQ_CREDIT_BUREAU_YEAR
count 265992.000000
mean 1.899974
std 1.869295
min 0.000000
25% 0.000000
50% 1.000000
75% 3.000000
max 25.000000
In [12]:
print("Missing Values Count:")
print(app_data.isnull().sum().sort_values(ascending=False))
Missing Values Count:
COMMONAREA_MEDI 214865
COMMONAREA_AVG 214865
COMMONAREA_MODE 214865
NONLIVINGAPARTMENTS_MODE 213514
NONLIVINGAPARTMENTS_AVG 213514
...
NAME_HOUSING_TYPE 0
NAME_FAMILY_STATUS 0
NAME_EDUCATION_TYPE 0
NAME_INCOME_TYPE 0
SK_ID_CURR 0
Length: 122, dtype: int64
In [14]:
missing_percent = (app_data.isnull().sum() / len(app_data)) * 100
missing_df = pd.DataFrame({
'Column': app_data.columns,
'Missing_Count': app_data.isnull().sum(),
'Missing_Percentage': missing_percent
}).sort_values('Missing_Percentage', ascending=False)
print(missing_df[missing_df['Missing_Percentage'] > 0])
Column Missing_Count \
COMMONAREA_MEDI COMMONAREA_MEDI 214865
COMMONAREA_AVG COMMONAREA_AVG 214865
COMMONAREA_MODE COMMONAREA_MODE 214865
NONLIVINGAPARTMENTS_MODE NONLIVINGAPARTMENTS_MODE 213514
NONLIVINGAPARTMENTS_AVG NONLIVINGAPARTMENTS_AVG 213514
... ... ...
EXT_SOURCE_2 EXT_SOURCE_2 660
AMT_GOODS_PRICE AMT_GOODS_PRICE 278
AMT_ANNUITY AMT_ANNUITY 12
CNT_FAM_MEMBERS CNT_FAM_MEMBERS 2
DAYS_LAST_PHONE_CHANGE DAYS_LAST_PHONE_CHANGE 1
Missing_Percentage
COMMONAREA_MEDI 69.872297
COMMONAREA_AVG 69.872297
COMMONAREA_MODE 69.872297
NONLIVINGAPARTMENTS_MODE 69.432963
NONLIVINGAPARTMENTS_AVG 69.432963
... ...
EXT_SOURCE_2 0.214626
AMT_GOODS_PRICE 0.090403
AMT_ANNUITY 0.003902
CNT_FAM_MEMBERS 0.000650
DAYS_LAST_PHONE_CHANGE 0.000325
[67 rows x 3 columns]
In [15]:
print(prev_data.info())
print(prev_data.head())
print(prev_data.tail())
print(prev_data.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SK_ID_PREV 1670214 non-null int64
1 SK_ID_CURR 1670214 non-null int64
2 NAME_CONTRACT_TYPE 1670214 non-null object
3 AMT_ANNUITY 1297979 non-null float64
4 AMT_APPLICATION 1670214 non-null float64
5 AMT_CREDIT 1670213 non-null float64
6 AMT_DOWN_PAYMENT 774370 non-null float64
7 AMT_GOODS_PRICE 1284699 non-null float64
8 WEEKDAY_APPR_PROCESS_START 1670214 non-null object
9 HOUR_APPR_PROCESS_START 1670214 non-null int64
10 FLAG_LAST_APPL_PER_CONTRACT 1670214 non-null object
11 NFLAG_LAST_APPL_IN_DAY 1670214 non-null int64
12 RATE_DOWN_PAYMENT 774370 non-null float64
13 RATE_INTEREST_PRIMARY 5951 non-null float64
14 RATE_INTEREST_PRIVILEGED 5951 non-null float64
15 NAME_CASH_LOAN_PURPOSE 1670214 non-null object
16 NAME_CONTRACT_STATUS 1670214 non-null object
17 DAYS_DECISION 1670214 non-null int64
18 NAME_PAYMENT_TYPE 1670214 non-null object
19 CODE_REJECT_REASON 1670214 non-null object
20 NAME_TYPE_SUITE 849809 non-null object
21 NAME_CLIENT_TYPE 1670214 non-null object
22 NAME_GOODS_CATEGORY 1670214 non-null object
23 NAME_PORTFOLIO 1670214 non-null object
24 NAME_PRODUCT_TYPE 1670214 non-null object
25 CHANNEL_TYPE 1670214 non-null object
26 SELLERPLACE_AREA 1670214 non-null int64
27 NAME_SELLER_INDUSTRY 1670214 non-null object
28 CNT_PAYMENT 1297984 non-null float64
29 NAME_YIELD_GROUP 1670214 non-null object
30 PRODUCT_COMBINATION 1669868 non-null object
31 DAYS_FIRST_DRAWING 997149 non-null float64
32 DAYS_FIRST_DUE 997149 non-null float64
33 DAYS_LAST_DUE_1ST_VERSION 997149 non-null float64
34 DAYS_LAST_DUE 997149 non-null float64
35 DAYS_TERMINATION 997149 non-null float64
36 NFLAG_INSURED_ON_APPROVAL 997149 non-null float64
dtypes: float64(15), int64(6), object(16)
memory usage: 471.5+ MB
None
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION \
0 2030495 271877 Consumer loans 1730.430 17145.0
1 2802425 108129 Cash loans 25188.615 607500.0
2 2523466 122040 Cash loans 15060.735 112500.0
3 2819243 176158 Cash loans 47041.335 450000.0
4 1784265 202054 Cash loans 31924.395 337500.0
AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START \
0 17145.0 0.0 17145.0 SATURDAY
1 679671.0 NaN 607500.0 THURSDAY
2 136444.5 NaN 112500.0 TUESDAY
3 470790.0 NaN 450000.0 MONDAY
4 404055.0 NaN 337500.0 THURSDAY
HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT \
0 15 Y
1 11 Y
2 11 Y
3 7 Y
4 9 Y
NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY \
0 1 0.0 0.182832
1 1 NaN NaN
2 1 NaN NaN
3 1 NaN NaN
4 1 NaN NaN
RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS \
0 0.867336 XAP Approved
1 NaN XNA Approved
2 NaN XNA Approved
3 NaN XNA Approved
4 NaN Repairs Refused
DAYS_DECISION NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE \
0 -73 Cash through the bank XAP NaN
1 -164 XNA XAP Unaccompanied
2 -301 Cash through the bank XAP Spouse, partner
3 -512 Cash through the bank XAP NaN
4 -781 Cash through the bank HC NaN
NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE \
0 Repeater Mobile POS XNA
1 Repeater XNA Cash x-sell
2 Repeater XNA Cash x-sell
3 Repeater XNA Cash x-sell
4 Repeater XNA Cash walk-in
CHANNEL_TYPE SELLERPLACE_AREA NAME_SELLER_INDUSTRY \
0 Country-wide 35 Connectivity
1 Contact center -1 XNA
2 Credit and cash offices -1 XNA
3 Credit and cash offices -1 XNA
4 Credit and cash offices -1 XNA
CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING \
0 12.0 middle POS mobile with interest 365243.0
1 36.0 low_action Cash X-Sell: low 365243.0
2 12.0 high Cash X-Sell: high 365243.0
3 12.0 middle Cash X-Sell: middle 365243.0
4 24.0 high Cash Street: high NaN
DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION \
0 -42.0 300.0 -42.0 -37.0
1 -134.0 916.0 365243.0 365243.0
2 -271.0 59.0 365243.0 365243.0
3 -482.0 -152.0 -182.0 -177.0
4 NaN NaN NaN NaN
NFLAG_INSURED_ON_APPROVAL
0 0.0
1 1.0
2 1.0
3 1.0
4 NaN
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY \
1670209 2300464 352015 Consumer loans 14704.290
1670210 2357031 334635 Consumer loans 6622.020
1670211 2659632 249544 Consumer loans 11520.855
1670212 2785582 400317 Cash loans 18821.520
1670213 2418762 261212 Cash loans 16431.300
AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE \
1670209 267295.5 311400.0 0.0 267295.5
1670210 87750.0 64291.5 29250.0 87750.0
1670211 105237.0 102523.5 10525.5 105237.0
1670212 180000.0 191880.0 NaN 180000.0
1670213 360000.0 360000.0 NaN 360000.0
WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START \
1670209 WEDNESDAY 12
1670210 TUESDAY 15
1670211 MONDAY 12
1670212 WEDNESDAY 9
1670213 SUNDAY 10
FLAG_LAST_APPL_PER_CONTRACT NFLAG_LAST_APPL_IN_DAY \
1670209 Y 1
1670210 Y 1
1670211 Y 1
1670212 Y 1
1670213 Y 1
RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED \
1670209 0.000000 NaN NaN
1670210 0.340554 NaN NaN
1670211 0.101401 NaN NaN
1670212 NaN NaN NaN
1670213 NaN NaN NaN
NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS DAYS_DECISION \
1670209 XAP Approved -544
1670210 XAP Approved -1694
1670211 XAP Approved -1488
1670212 XNA Approved -1185
1670213 XNA Approved -1193
NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE \
1670209 Cash through the bank XAP NaN
1670210 Cash through the bank XAP Unaccompanied
1670211 Cash through the bank XAP Spouse, partner
1670212 Cash through the bank XAP Family
1670213 Cash through the bank XAP Family
NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO \
1670209 Refreshed Furniture POS
1670210 New Furniture POS
1670211 Repeater Consumer Electronics POS
1670212 Repeater XNA Cash
1670213 Repeater XNA Cash
NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA \
1670209 XNA Stone 43
1670210 XNA Stone 43
1670211 XNA Country-wide 1370
1670212 x-sell AP+ (Cash loan) -1
1670213 x-sell AP+ (Cash loan) -1
NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP \
1670209 Furniture 30.0 low_normal
1670210 Furniture 12.0 middle
1670211 Consumer electronics 10.0 low_normal
1670212 XNA 12.0 low_normal
1670213 XNA 48.0 middle
PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE \
1670209 POS industry with interest 365243.0 -508.0
1670210 POS industry with interest 365243.0 -1604.0
1670211 POS household with interest 365243.0 -1457.0
1670212 Cash X-Sell: low 365243.0 -1155.0
1670213 Cash X-Sell: middle 365243.0 -1163.0
DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION \
1670209 362.0 -358.0 -351.0
1670210 -1274.0 -1304.0 -1297.0
1670211 -1187.0 -1187.0 -1181.0
1670212 -825.0 -825.0 -817.0
1670213 247.0 -443.0 -423.0
NFLAG_INSURED_ON_APPROVAL
1670209 0.0
1670210 0.0
1670211 0.0
1670212 1.0
1670213 0.0
SK_ID_PREV SK_ID_CURR AMT_ANNUITY AMT_APPLICATION \
count 1.670214e+06 1.670214e+06 1.297979e+06 1.670214e+06
mean 1.923089e+06 2.783572e+05 1.595512e+04 1.752339e+05
std 5.325980e+05 1.028148e+05 1.478214e+04 2.927798e+05
min 1.000001e+06 1.000010e+05 0.000000e+00 0.000000e+00
25% 1.461857e+06 1.893290e+05 6.321780e+03 1.872000e+04
50% 1.923110e+06 2.787145e+05 1.125000e+04 7.104600e+04
75% 2.384280e+06 3.675140e+05 2.065842e+04 1.803600e+05
max 2.845382e+06 4.562550e+05 4.180581e+05 6.905160e+06
AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE \
count 1.670213e+06 7.743700e+05 1.284699e+06
mean 1.961140e+05 6.697402e+03 2.278473e+05
std 3.185746e+05 2.092150e+04 3.153966e+05
min 0.000000e+00 -9.000000e-01 0.000000e+00
25% 2.416050e+04 0.000000e+00 5.084100e+04
50% 8.054100e+04 1.638000e+03 1.123200e+05
75% 2.164185e+05 7.740000e+03 2.340000e+05
max 6.905160e+06 3.060045e+06 6.905160e+06
HOUR_APPR_PROCESS_START NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT \
count 1.670214e+06 1.670214e+06 774370.000000
mean 1.248418e+01 9.964675e-01 0.079637
std 3.334028e+00 5.932963e-02 0.107823
min 0.000000e+00 0.000000e+00 -0.000015
25% 1.000000e+01 1.000000e+00 0.000000
50% 1.200000e+01 1.000000e+00 0.051605
75% 1.500000e+01 1.000000e+00 0.108909
max 2.300000e+01 1.000000e+00 1.000000
RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED DAYS_DECISION \
count 5951.000000 5951.000000 1.670214e+06
mean 0.188357 0.773503 -8.806797e+02
std 0.087671 0.100879 7.790997e+02
min 0.034781 0.373150 -2.922000e+03
25% 0.160716 0.715645 -1.300000e+03
50% 0.189122 0.835095 -5.810000e+02
75% 0.193330 0.852537 -2.800000e+02
max 1.000000 1.000000 -1.000000e+00
SELLERPLACE_AREA CNT_PAYMENT DAYS_FIRST_DRAWING DAYS_FIRST_DUE \
count 1.670214e+06 1.297984e+06 997149.000000 997149.000000
mean 3.139511e+02 1.605408e+01 342209.855039 13826.269337
std 7.127443e+03 1.456729e+01 88916.115833 72444.869708
min -1.000000e+00 0.000000e+00 -2922.000000 -2892.000000
25% -1.000000e+00 6.000000e+00 365243.000000 -1628.000000
50% 3.000000e+00 1.200000e+01 365243.000000 -831.000000
75% 8.200000e+01 2.400000e+01 365243.000000 -411.000000
max 4.000000e+06 8.400000e+01 365243.000000 365243.000000
DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION \
count 997149.000000 997149.000000 997149.000000
mean 33767.774054 76582.403064 81992.343838
std 106857.034789 149647.415123 153303.516729
min -2801.000000 -2889.000000 -2874.000000
25% -1242.000000 -1314.000000 -1270.000000
50% -361.000000 -537.000000 -499.000000
75% 129.000000 -74.000000 -44.000000
max 365243.000000 365243.000000 365243.000000
NFLAG_INSURED_ON_APPROVAL
count 997149.000000
mean 0.332570
std 0.471134
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
In [16]:
print("Missing Values Count:")
print(prev_data.isnull().sum().sort_values(ascending=False))
Missing Values Count: RATE_INTEREST_PRIVILEGED 1664263 RATE_INTEREST_PRIMARY 1664263 AMT_DOWN_PAYMENT 895844 RATE_DOWN_PAYMENT 895844 NAME_TYPE_SUITE 820405 NFLAG_INSURED_ON_APPROVAL 673065 DAYS_TERMINATION 673065 DAYS_LAST_DUE 673065 DAYS_LAST_DUE_1ST_VERSION 673065 DAYS_FIRST_DUE 673065 DAYS_FIRST_DRAWING 673065 AMT_GOODS_PRICE 385515 AMT_ANNUITY 372235 CNT_PAYMENT 372230 PRODUCT_COMBINATION 346 AMT_CREDIT 1 NAME_YIELD_GROUP 0 NAME_PORTFOLIO 0 NAME_SELLER_INDUSTRY 0 SELLERPLACE_AREA 0 CHANNEL_TYPE 0 NAME_PRODUCT_TYPE 0 SK_ID_PREV 0 NAME_GOODS_CATEGORY 0 NAME_CLIENT_TYPE 0 CODE_REJECT_REASON 0 SK_ID_CURR 0 DAYS_DECISION 0 NAME_CONTRACT_STATUS 0 NAME_CASH_LOAN_PURPOSE 0 NFLAG_LAST_APPL_IN_DAY 0 FLAG_LAST_APPL_PER_CONTRACT 0 HOUR_APPR_PROCESS_START 0 WEEKDAY_APPR_PROCESS_START 0 AMT_APPLICATION 0 NAME_CONTRACT_TYPE 0 NAME_PAYMENT_TYPE 0 dtype: int64
In [17]:
missing_percent = (prev_data.isnull().sum() / len(app_data)) * 100
missing_df = pd.DataFrame({
'Column': prev_data.columns,
'Missing_Count': prev_data.isnull().sum(),
'Missing_Percentage': missing_percent
}).sort_values('Missing_Percentage', ascending=False)
print(missing_df[missing_df['Missing_Percentage'] > 0])
Column Missing_Count \
RATE_INTEREST_PRIVILEGED RATE_INTEREST_PRIVILEGED 1664263
RATE_INTEREST_PRIMARY RATE_INTEREST_PRIMARY 1664263
AMT_DOWN_PAYMENT AMT_DOWN_PAYMENT 895844
RATE_DOWN_PAYMENT RATE_DOWN_PAYMENT 895844
NAME_TYPE_SUITE NAME_TYPE_SUITE 820405
NFLAG_INSURED_ON_APPROVAL NFLAG_INSURED_ON_APPROVAL 673065
DAYS_TERMINATION DAYS_TERMINATION 673065
DAYS_LAST_DUE DAYS_LAST_DUE 673065
DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE_1ST_VERSION 673065
DAYS_FIRST_DUE DAYS_FIRST_DUE 673065
DAYS_FIRST_DRAWING DAYS_FIRST_DRAWING 673065
AMT_GOODS_PRICE AMT_GOODS_PRICE 385515
AMT_ANNUITY AMT_ANNUITY 372235
CNT_PAYMENT CNT_PAYMENT 372230
PRODUCT_COMBINATION PRODUCT_COMBINATION 346
AMT_CREDIT AMT_CREDIT 1
Missing_Percentage
RATE_INTEREST_PRIVILEGED 541.204380
RATE_INTEREST_PRIMARY 541.204380
AMT_DOWN_PAYMENT 291.320961
RATE_DOWN_PAYMENT 291.320961
NAME_TYPE_SUITE 266.788830
NFLAG_INSURED_ON_APPROVAL 218.875097
DAYS_TERMINATION 218.875097
DAYS_LAST_DUE 218.875097
DAYS_LAST_DUE_1ST_VERSION 218.875097
DAYS_FIRST_DUE 218.875097
DAYS_FIRST_DRAWING 218.875097
AMT_GOODS_PRICE 125.366247
AMT_ANNUITY 121.047702
CNT_PAYMENT 121.046076
PRODUCT_COMBINATION 0.112516
AMT_CREDIT 0.000325
In [18]:
pd.set_option("display.max_rows", 100)
app_data.isnull().mean()*100
Out[18]:
SK_ID_CURR 0.000000
TARGET 0.000000
NAME_CONTRACT_TYPE 0.000000
CODE_GENDER 0.000000
FLAG_OWN_CAR 0.000000
...
AMT_REQ_CREDIT_BUREAU_DAY 13.501631
AMT_REQ_CREDIT_BUREAU_WEEK 13.501631
AMT_REQ_CREDIT_BUREAU_MON 13.501631
AMT_REQ_CREDIT_BUREAU_QRT 13.501631
AMT_REQ_CREDIT_BUREAU_YEAR 13.501631
Length: 122, dtype: float64
In [20]:
percentage = 47
threshold = int(((100-percentage)/100)*app_data.shape[0]+1)
app_df = app_data.dropna(axis=1,how = 'any')
app_df.head()
Out[20]:
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | 1 | 1 | 0 | 1 | 1 | 0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 1 | 1 | 1 | 1 | 1 | 0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | 1 | 1 | 0 | 1 | 0 | 0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | 1 | 1 | 0 | 1 | 0 | 0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
In [25]:
app_df.shape
Out[25]:
(307511, 76)
In [27]:
app_df.isnull().mean()*100
Out[27]:
SK_ID_CURR 0.0 TARGET 0.0 NAME_CONTRACT_TYPE 0.0 CODE_GENDER 0.0 FLAG_OWN_CAR 0.0 FLAG_OWN_REALTY 0.0 CNT_CHILDREN 0.0 AMT_INCOME_TOTAL 0.0 AMT_CREDIT 0.0 AMT_ANNUITY 0.0 AMT_GOODS_PRICE 0.0 NAME_INCOME_TYPE 0.0 NAME_EDUCATION_TYPE 0.0 NAME_FAMILY_STATUS 0.0 NAME_HOUSING_TYPE 0.0 REGION_POPULATION_RELATIVE 0.0 DAYS_BIRTH 0.0 DAYS_EMPLOYED 0.0 DAYS_REGISTRATION 0.0 DAYS_ID_PUBLISH 0.0 FLAG_MOBIL 0.0 FLAG_EMP_PHONE 0.0 FLAG_WORK_PHONE 0.0 FLAG_CONT_MOBILE 0.0 FLAG_PHONE 0.0 FLAG_EMAIL 0.0 CNT_FAM_MEMBERS 0.0 REGION_RATING_CLIENT 0.0 REGION_RATING_CLIENT_W_CITY 0.0 WEEKDAY_APPR_PROCESS_START 0.0 HOUR_APPR_PROCESS_START 0.0 REG_REGION_NOT_LIVE_REGION 0.0 REG_REGION_NOT_WORK_REGION 0.0 LIVE_REGION_NOT_WORK_REGION 0.0 REG_CITY_NOT_LIVE_CITY 0.0 REG_CITY_NOT_WORK_CITY 0.0 LIVE_CITY_NOT_WORK_CITY 0.0 ORGANIZATION_TYPE 0.0 EXT_SOURCE_2 0.0 EXT_SOURCE_3 0.0 OBS_30_CNT_SOCIAL_CIRCLE 0.0 DEF_30_CNT_SOCIAL_CIRCLE 0.0 OBS_60_CNT_SOCIAL_CIRCLE 0.0 DEF_60_CNT_SOCIAL_CIRCLE 0.0 DAYS_LAST_PHONE_CHANGE 0.0 FLAG_DOCUMENT_2 0.0 FLAG_DOCUMENT_3 0.0 FLAG_DOCUMENT_4 0.0 FLAG_DOCUMENT_5 0.0 FLAG_DOCUMENT_6 0.0 FLAG_DOCUMENT_7 0.0 FLAG_DOCUMENT_8 0.0 FLAG_DOCUMENT_9 0.0 FLAG_DOCUMENT_10 0.0 FLAG_DOCUMENT_11 0.0 FLAG_DOCUMENT_12 0.0 FLAG_DOCUMENT_13 0.0 FLAG_DOCUMENT_14 0.0 FLAG_DOCUMENT_15 0.0 FLAG_DOCUMENT_16 0.0 FLAG_DOCUMENT_17 0.0 FLAG_DOCUMENT_18 0.0 FLAG_DOCUMENT_19 0.0 FLAG_DOCUMENT_20 0.0 FLAG_DOCUMENT_21 0.0 AMT_REQ_CREDIT_BUREAU_HOUR 0.0 AMT_REQ_CREDIT_BUREAU_DAY 0.0 AMT_REQ_CREDIT_BUREAU_WEEK 0.0 AMT_REQ_CREDIT_BUREAU_MON 0.0 AMT_REQ_CREDIT_BUREAU_QRT 0.0 AMT_REQ_CREDIT_BUREAU_YEAR 0.0 YEARS_BIRTH 0.0 YEARS_EMPLOYED 0.0 YEARS_REGISTRATION 0.0 YEARS_ID_PUBLISH 0.0 YEARS_LAST_PHONE_CHANGE 0.0 dtype: float64
In [28]:
app_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 76 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307511 non-null float64 10 AMT_GOODS_PRICE 307511 non-null float64 11 NAME_INCOME_TYPE 307511 non-null object 12 NAME_EDUCATION_TYPE 307511 non-null object 13 NAME_FAMILY_STATUS 307511 non-null object 14 NAME_HOUSING_TYPE 307511 non-null object 15 REGION_POPULATION_RELATIVE 307511 non-null float64 16 DAYS_BIRTH 307511 non-null int64 17 DAYS_EMPLOYED 307511 non-null int64 18 DAYS_REGISTRATION 307511 non-null float64 19 DAYS_ID_PUBLISH 307511 non-null int64 20 FLAG_MOBIL 307511 non-null int64 21 FLAG_EMP_PHONE 307511 non-null int64 22 FLAG_WORK_PHONE 307511 non-null int64 23 FLAG_CONT_MOBILE 307511 non-null int64 24 FLAG_PHONE 307511 non-null int64 25 FLAG_EMAIL 307511 non-null int64 26 CNT_FAM_MEMBERS 307511 non-null float64 27 REGION_RATING_CLIENT 307511 non-null int64 28 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 29 WEEKDAY_APPR_PROCESS_START 307511 non-null object 30 HOUR_APPR_PROCESS_START 307511 non-null int64 31 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 32 REG_REGION_NOT_WORK_REGION 307511 non-null int64 33 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 34 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 35 REG_CITY_NOT_WORK_CITY 307511 non-null int64 36 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 37 ORGANIZATION_TYPE 307511 non-null object 38 EXT_SOURCE_2 307511 non-null float64 39 EXT_SOURCE_3 307511 non-null float64 40 OBS_30_CNT_SOCIAL_CIRCLE 307511 non-null float64 41 DEF_30_CNT_SOCIAL_CIRCLE 307511 non-null float64 42 OBS_60_CNT_SOCIAL_CIRCLE 307511 non-null float64 43 DEF_60_CNT_SOCIAL_CIRCLE 307511 non-null float64 44 DAYS_LAST_PHONE_CHANGE 307511 non-null float64 45 FLAG_DOCUMENT_2 307511 non-null int64 46 FLAG_DOCUMENT_3 307511 non-null int64 47 FLAG_DOCUMENT_4 307511 non-null int64 48 FLAG_DOCUMENT_5 307511 non-null int64 49 FLAG_DOCUMENT_6 307511 non-null int64 50 FLAG_DOCUMENT_7 307511 non-null int64 51 FLAG_DOCUMENT_8 307511 non-null int64 52 FLAG_DOCUMENT_9 307511 non-null int64 53 FLAG_DOCUMENT_10 307511 non-null int64 54 FLAG_DOCUMENT_11 307511 non-null int64 55 FLAG_DOCUMENT_12 307511 non-null int64 56 FLAG_DOCUMENT_13 307511 non-null int64 57 FLAG_DOCUMENT_14 307511 non-null int64 58 FLAG_DOCUMENT_15 307511 non-null int64 59 FLAG_DOCUMENT_16 307511 non-null int64 60 FLAG_DOCUMENT_17 307511 non-null int64 61 FLAG_DOCUMENT_18 307511 non-null int64 62 FLAG_DOCUMENT_19 307511 non-null int64 63 FLAG_DOCUMENT_20 307511 non-null int64 64 FLAG_DOCUMENT_21 307511 non-null int64 65 AMT_REQ_CREDIT_BUREAU_HOUR 307511 non-null float64 66 AMT_REQ_CREDIT_BUREAU_DAY 307511 non-null float64 67 AMT_REQ_CREDIT_BUREAU_WEEK 307511 non-null float64 68 AMT_REQ_CREDIT_BUREAU_MON 307511 non-null float64 69 AMT_REQ_CREDIT_BUREAU_QRT 307511 non-null float64 70 AMT_REQ_CREDIT_BUREAU_YEAR 307511 non-null float64 71 YEARS_BIRTH 307511 non-null int64 72 YEARS_EMPLOYED 307511 non-null int64 73 YEARS_REGISTRATION 307511 non-null float64 74 YEARS_ID_PUBLISH 307511 non-null int64 75 YEARS_LAST_PHONE_CHANGE 307511 non-null float64 dtypes: float64(22), int64(44), object(10) memory usage: 178.3+ MB
In [34]:
app_df.EXT_SOURCE_3.isnull().mean()*100
Out[34]:
0.0
In [33]:
app_df.EXT_SOURCE_3.value_counts(normalize =True)*100
Out[33]:
EXT_SOURCE_3
0.535276 20.080908
0.746300 0.474780
0.713631 0.427627
0.694093 0.414945
0.670652 0.387303
...
0.021492 0.000325
0.019468 0.000325
0.023062 0.000325
0.014556 0.000325
0.043227 0.000325
Name: proportion, Length: 814, dtype: float64
In [36]:
app_df.EXT_SOURCE_3.describe()
Out[36]:
count 307511.000000 mean 0.515695 std 0.174736 min 0.000527 25% 0.417100 50% 0.535276 75% 0.636376 max 0.896010 Name: EXT_SOURCE_3, dtype: float64
In [32]:
for col in app_data.select_dtypes(include = [np.number]).columns:
median = app_data[col].median()
app_data[col].fillna(median, inplace = True)
In [37]:
sns.boxplot(app_df.EXT_SOURCE_3)
plt.show()
In [38]:
app_df.EXT_SOURCE_3.fillna(app_df.EXT_SOURCE_3.median(),inplace =True)
In [39]:
app_df.EXT_SOURCE_3.isnull().mean()*100
Out[39]:
0.0
In [40]:
app_df.EXT_SOURCE_3.value_counts(normalize =True)*100
Out[40]:
EXT_SOURCE_3
0.535276 20.080908
0.746300 0.474780
0.713631 0.427627
0.694093 0.414945
0.670652 0.387303
...
0.021492 0.000325
0.019468 0.000325
0.023062 0.000325
0.014556 0.000325
0.043227 0.000325
Name: proportion, Length: 814, dtype: float64
In [41]:
null_cols = list(app_df.isna().any())
len(null_cols)
Out[41]:
76
In [43]:
app_df.isnull().mean()*100
Out[43]:
SK_ID_CURR 0.0 TARGET 0.0 NAME_CONTRACT_TYPE 0.0 CODE_GENDER 0.0 FLAG_OWN_CAR 0.0 FLAG_OWN_REALTY 0.0 CNT_CHILDREN 0.0 AMT_INCOME_TOTAL 0.0 AMT_CREDIT 0.0 AMT_ANNUITY 0.0 AMT_GOODS_PRICE 0.0 NAME_INCOME_TYPE 0.0 NAME_EDUCATION_TYPE 0.0 NAME_FAMILY_STATUS 0.0 NAME_HOUSING_TYPE 0.0 REGION_POPULATION_RELATIVE 0.0 DAYS_BIRTH 0.0 DAYS_EMPLOYED 0.0 DAYS_REGISTRATION 0.0 DAYS_ID_PUBLISH 0.0 FLAG_MOBIL 0.0 FLAG_EMP_PHONE 0.0 FLAG_WORK_PHONE 0.0 FLAG_CONT_MOBILE 0.0 FLAG_PHONE 0.0 FLAG_EMAIL 0.0 CNT_FAM_MEMBERS 0.0 REGION_RATING_CLIENT 0.0 REGION_RATING_CLIENT_W_CITY 0.0 WEEKDAY_APPR_PROCESS_START 0.0 HOUR_APPR_PROCESS_START 0.0 REG_REGION_NOT_LIVE_REGION 0.0 REG_REGION_NOT_WORK_REGION 0.0 LIVE_REGION_NOT_WORK_REGION 0.0 REG_CITY_NOT_LIVE_CITY 0.0 REG_CITY_NOT_WORK_CITY 0.0 LIVE_CITY_NOT_WORK_CITY 0.0 ORGANIZATION_TYPE 0.0 EXT_SOURCE_2 0.0 EXT_SOURCE_3 0.0 OBS_30_CNT_SOCIAL_CIRCLE 0.0 DEF_30_CNT_SOCIAL_CIRCLE 0.0 OBS_60_CNT_SOCIAL_CIRCLE 0.0 DEF_60_CNT_SOCIAL_CIRCLE 0.0 DAYS_LAST_PHONE_CHANGE 0.0 FLAG_DOCUMENT_2 0.0 FLAG_DOCUMENT_3 0.0 FLAG_DOCUMENT_4 0.0 FLAG_DOCUMENT_5 0.0 FLAG_DOCUMENT_6 0.0 FLAG_DOCUMENT_7 0.0 FLAG_DOCUMENT_8 0.0 FLAG_DOCUMENT_9 0.0 FLAG_DOCUMENT_10 0.0 FLAG_DOCUMENT_11 0.0 FLAG_DOCUMENT_12 0.0 FLAG_DOCUMENT_13 0.0 FLAG_DOCUMENT_14 0.0 FLAG_DOCUMENT_15 0.0 FLAG_DOCUMENT_16 0.0 FLAG_DOCUMENT_17 0.0 FLAG_DOCUMENT_18 0.0 FLAG_DOCUMENT_19 0.0 FLAG_DOCUMENT_20 0.0 FLAG_DOCUMENT_21 0.0 AMT_REQ_CREDIT_BUREAU_HOUR 0.0 AMT_REQ_CREDIT_BUREAU_DAY 0.0 AMT_REQ_CREDIT_BUREAU_WEEK 0.0 AMT_REQ_CREDIT_BUREAU_MON 0.0 AMT_REQ_CREDIT_BUREAU_QRT 0.0 AMT_REQ_CREDIT_BUREAU_YEAR 0.0 YEARS_BIRTH 0.0 YEARS_EMPLOYED 0.0 YEARS_REGISTRATION 0.0 YEARS_ID_PUBLISH 0.0 YEARS_LAST_PHONE_CHANGE 0.0 dtype: float64
In [45]:
app_df.AMT_REQ_CREDIT_BUREAU_DAY.value_counts(normalize = True)*100
Out[45]:
AMT_REQ_CREDIT_BUREAU_DAY 0.0 99.515790 1.0 0.420148 2.0 0.034470 3.0 0.014634 4.0 0.008455 5.0 0.002927 6.0 0.002602 9.0 0.000650 8.0 0.000325 Name: proportion, dtype: float64
In [47]:
cols = ['AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']
In [49]:
for col in cols:
app_df[col].fillna(app_df[col].mode()[0],inplace =True)
In [ ]:
app_df.EXT_SOURCE_2.fillna(app_dF.EXT_SOURCE_2.medain(),inplace =True)
In [10]:
for col in app_data.select_dtypes(include = [np.number]).columns:
app_data[col] = app_data[col].abs()
In [11]:
days_columns = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE']
for col in days_columns:
new_col = col.replace('DAYS', 'YEARS')
app_data[new_col] = app_data[col].apply(lambda x: x // 365)
In [12]:
print(app_data.isnull().sum())
SK_ID_CURR 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
..
YEARS_BIRTH 0
YEARS_EMPLOYED 0
YEARS_REGISTRATION 0
YEARS_ID_PUBLISH 0
YEARS_LAST_PHONE_CHANGE 0
Length: 78, dtype: int64
In [13]:
bins = [0, 200000, 400000, 600000, 800000, 1000000]
labels = ['Very Low Credit', 'Low Credit', 'Medium Credit', 'High Credit', 'Very High Credit']
app_data['AMT_CREDIT_CATEGORY'] = pd.cut(app_data['AMT_CREDIT'], bins=bins,labels=labels)
In [14]:
sns.countplot(data = app_data, x='AMT_CREDIT_CATEGORY')
plt.show()
In [15]:
app_data.OCCUPATION_TYPE.isnull().mean() * 100
Out[15]:
31.345545362604916
In [16]:
app_data.OCCUPATION_TYPE.value_counts(normalize = True) * 100
Out[16]:
OCCUPATION_TYPE Laborers 26.139636 Sales staff 15.205570 Core staff 13.058924 Managers 10.122679 Drivers 8.811576 High skill tech staff 5.390299 Accountants 4.648067 Medicine staff 4.043672 Security staff 3.183498 Cooking staff 2.816408 Cleaning staff 2.203960 Private service staff 1.256158 Low-skill Laborers 0.991379 Waiters/barmen staff 0.638499 Secretaries 0.618132 Realty agents 0.355722 HR staff 0.266673 IT staff 0.249147 Name: proportion, dtype: float64
In [26]:
categorical_cols = []
numerical_cols = []
for col in app_data.columns:
if app_data[col].dtype == 'object' or app_data[col].nunique() < 10:
categorical_cols.append(col)
else:
numerical_cols.append(col)
print(f"Categorical columns: {len(categorical_cols)}")
print(f"Numerical columns: {len(numerical_cols)}")
Categorical columns: 55 Numerical columns: 67
In [29]:
def plot_categorical_analysis(df, categorical_cols, target_col='TARGET'):
plt.figure(figsize=(15, 20))
for i, col in enumerate(categorical_cols[:12]): # Limit to first 12
plt.subplot(4, 3, i+1)
# Count plot
df[col].value_counts().plot(kind='bar')
plt.title(f'Distribution of {col}')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
plot_categorical_analysis(app_data, categorical_cols)
In [30]:
def plot_numerical_analysis(df, numerical_cols):
plt.figure(figsize=(15, 20))
for i, col in enumerate(numerical_cols[:12]): # Limit to first 12
plt.subplot(4, 3, i+1)
# Histogram
df[col].hist(bins=30, alpha=0.7)
plt.title(f'Distribution of {col}')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
plot_numerical_analysis(app_data, numerical_cols)
In [32]:
## Univariate Analysis with Target Variable
def plot_categorical_target_analysis(df, categorical_cols, target_col='TARGET'):
plt.figure(figsize=(15, 20))
for i, col in enumerate(categorical_cols[:8]):
plt.subplot(4, 2, i+1)
# Crosstab with percentages
ct = pd.crosstab(df[col], df[target_col], normalize='index') * 100
ct.plot(kind='bar', stacked=True)
plt.title(f'{col} vs Target')
plt.xticks(rotation=45)
plt.legend(['No Difficulty', 'Difficulty'])
plt.tight_layout()
plt.show()
plot_categorical_target_analysis(app_data, categorical_cols)
In [34]:
## Bivariate Analysis
### Correlation analysis
# Correlation matrix for numerical variables
numerical_data = app_data[numerical_cols].select_dtypes(include=[np.number])
plt.figure(figsize=(12, 10))
correlation_matrix = numerical_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix of Numerical Variables')
plt.show()
# Find highly correlated pairs
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
for j in range(i+1, len(correlation_matrix.columns)):
corr_val = correlation_matrix.iloc[i, j]
if abs(corr_val) > 0.7: # High correlation threshold
high_corr_pairs.append((correlation_matrix.columns[i],
correlation_matrix.columns[j],
corr_val))
print("Highly correlated pairs:")
for pair in high_corr_pairs:
print(f"{pair[0]} - {pair[1]}: {pair[2]:.3f}")
Highly correlated pairs: CNT_CHILDREN - CNT_FAM_MEMBERS: 0.879 AMT_CREDIT - AMT_ANNUITY: 0.770 AMT_CREDIT - AMT_GOODS_PRICE: 0.987 AMT_ANNUITY - AMT_GOODS_PRICE: 0.775 APARTMENTS_AVG - ELEVATORS_AVG: 0.837 APARTMENTS_AVG - LIVINGAPARTMENTS_AVG: 0.944 APARTMENTS_AVG - LIVINGAREA_AVG: 0.914 APARTMENTS_AVG - APARTMENTS_MODE: 0.973 APARTMENTS_AVG - ELEVATORS_MODE: 0.823 APARTMENTS_AVG - LIVINGAPARTMENTS_MODE: 0.931 APARTMENTS_AVG - LIVINGAREA_MODE: 0.893 APARTMENTS_AVG - APARTMENTS_MEDI: 0.995 APARTMENTS_AVG - ELEVATORS_MEDI: 0.835 APARTMENTS_AVG - LIVINGAPARTMENTS_MEDI: 0.942 APARTMENTS_AVG - LIVINGAREA_MEDI: 0.912 APARTMENTS_AVG - TOTALAREA_MODE: 0.893 BASEMENTAREA_AVG - BASEMENTAREA_MODE: 0.973 BASEMENTAREA_AVG - BASEMENTAREA_MEDI: 0.994 YEARS_BEGINEXPLUATATION_AVG - YEARS_BEGINEXPLUATATION_MODE: 0.972 YEARS_BEGINEXPLUATATION_AVG - YEARS_BEGINEXPLUATATION_MEDI: 0.994 YEARS_BUILD_AVG - YEARS_BUILD_MODE: 0.989 YEARS_BUILD_AVG - YEARS_BUILD_MEDI: 0.998 COMMONAREA_AVG - COMMONAREA_MODE: 0.977 COMMONAREA_AVG - COMMONAREA_MEDI: 0.996 ELEVATORS_AVG - LIVINGAPARTMENTS_AVG: 0.812 ELEVATORS_AVG - LIVINGAREA_AVG: 0.868 ELEVATORS_AVG - APARTMENTS_MODE: 0.806 ELEVATORS_AVG - ELEVATORS_MODE: 0.979 ELEVATORS_AVG - LIVINGAPARTMENTS_MODE: 0.797 ELEVATORS_AVG - LIVINGAREA_MODE: 0.839 ELEVATORS_AVG - APARTMENTS_MEDI: 0.835 ELEVATORS_AVG - ELEVATORS_MEDI: 0.996 ELEVATORS_AVG - LIVINGAPARTMENTS_MEDI: 0.813 ELEVATORS_AVG - LIVINGAREA_MEDI: 0.866 ELEVATORS_AVG - TOTALAREA_MODE: 0.845 ENTRANCES_AVG - ENTRANCES_MODE: 0.978 ENTRANCES_AVG - ENTRANCES_MEDI: 0.997 FLOORSMAX_AVG - FLOORSMIN_AVG: 0.742 FLOORSMAX_AVG - FLOORSMAX_MODE: 0.986 FLOORSMAX_AVG - FLOORSMIN_MODE: 0.722 FLOORSMAX_AVG - FLOORSMAX_MEDI: 0.997 FLOORSMAX_AVG - FLOORSMIN_MEDI: 0.740 FLOORSMIN_AVG - FLOORSMAX_MODE: 0.729 FLOORSMIN_AVG - FLOORSMIN_MODE: 0.986 FLOORSMIN_AVG - FLOORSMAX_MEDI: 0.740 FLOORSMIN_AVG - FLOORSMIN_MEDI: 0.997 LANDAREA_AVG - LANDAREA_MODE: 0.974 LANDAREA_AVG - LANDAREA_MEDI: 0.992 LIVINGAPARTMENTS_AVG - LIVINGAREA_AVG: 0.881 LIVINGAPARTMENTS_AVG - APARTMENTS_MODE: 0.908 LIVINGAPARTMENTS_AVG - ELEVATORS_MODE: 0.794 LIVINGAPARTMENTS_AVG - LIVINGAPARTMENTS_MODE: 0.970 LIVINGAPARTMENTS_AVG - LIVINGAREA_MODE: 0.852 LIVINGAPARTMENTS_AVG - APARTMENTS_MEDI: 0.936 LIVINGAPARTMENTS_AVG - ELEVATORS_MEDI: 0.809 LIVINGAPARTMENTS_AVG - LIVINGAPARTMENTS_MEDI: 0.994 LIVINGAPARTMENTS_AVG - LIVINGAREA_MEDI: 0.878 LIVINGAPARTMENTS_AVG - TOTALAREA_MODE: 0.848 LIVINGAREA_AVG - APARTMENTS_MODE: 0.891 LIVINGAREA_AVG - ELEVATORS_MODE: 0.853 LIVINGAREA_AVG - LIVINGAPARTMENTS_MODE: 0.873 LIVINGAREA_AVG - LIVINGAREA_MODE: 0.972 LIVINGAREA_AVG - APARTMENTS_MEDI: 0.913 LIVINGAREA_AVG - ELEVATORS_MEDI: 0.866 LIVINGAREA_AVG - LIVINGAPARTMENTS_MEDI: 0.883 LIVINGAREA_AVG - LIVINGAREA_MEDI: 0.996 LIVINGAREA_AVG - TOTALAREA_MODE: 0.925 NONLIVINGAPARTMENTS_AVG - NONLIVINGAPARTMENTS_MODE: 0.969 NONLIVINGAPARTMENTS_AVG - NONLIVINGAPARTMENTS_MEDI: 0.991 NONLIVINGAREA_AVG - NONLIVINGAREA_MODE: 0.966 NONLIVINGAREA_AVG - NONLIVINGAREA_MEDI: 0.990 APARTMENTS_MODE - ELEVATORS_MODE: 0.826 APARTMENTS_MODE - LIVINGAPARTMENTS_MODE: 0.938 APARTMENTS_MODE - LIVINGAREA_MODE: 0.910 APARTMENTS_MODE - APARTMENTS_MEDI: 0.977 APARTMENTS_MODE - ELEVATORS_MEDI: 0.809 APARTMENTS_MODE - LIVINGAPARTMENTS_MEDI: 0.915 APARTMENTS_MODE - LIVINGAREA_MEDI: 0.894 APARTMENTS_MODE - TOTALAREA_MODE: 0.864 BASEMENTAREA_MODE - BASEMENTAREA_MEDI: 0.978 YEARS_BEGINEXPLUATATION_MODE - YEARS_BEGINEXPLUATATION_MEDI: 0.964 YEARS_BUILD_MODE - YEARS_BUILD_MEDI: 0.989 COMMONAREA_MODE - COMMONAREA_MEDI: 0.980 ELEVATORS_MODE - LIVINGAPARTMENTS_MODE: 0.808 ELEVATORS_MODE - LIVINGAREA_MODE: 0.856 ELEVATORS_MODE - APARTMENTS_MEDI: 0.826 ELEVATORS_MODE - ELEVATORS_MEDI: 0.983 ELEVATORS_MODE - LIVINGAPARTMENTS_MEDI: 0.799 ELEVATORS_MODE - LIVINGAREA_MEDI: 0.856 ELEVATORS_MODE - TOTALAREA_MODE: 0.821 ENTRANCES_MODE - ENTRANCES_MEDI: 0.981 FLOORSMAX_MODE - FLOORSMIN_MODE: 0.726 FLOORSMAX_MODE - FLOORSMAX_MEDI: 0.988 FLOORSMAX_MODE - FLOORSMIN_MEDI: 0.730 FLOORSMIN_MODE - FLOORSMAX_MEDI: 0.723 FLOORSMIN_MODE - FLOORSMIN_MEDI: 0.988 LANDAREA_MODE - LANDAREA_MEDI: 0.981 LIVINGAPARTMENTS_MODE - LIVINGAREA_MODE: 0.878 LIVINGAPARTMENTS_MODE - APARTMENTS_MEDI: 0.932 LIVINGAPARTMENTS_MODE - ELEVATORS_MEDI: 0.799 LIVINGAPARTMENTS_MODE - LIVINGAPARTMENTS_MEDI: 0.976 LIVINGAPARTMENTS_MODE - LIVINGAREA_MEDI: 0.874 LIVINGAPARTMENTS_MODE - TOTALAREA_MODE: 0.834 LIVINGAREA_MODE - APARTMENTS_MEDI: 0.896 LIVINGAREA_MODE - ELEVATORS_MEDI: 0.841 LIVINGAREA_MODE - LIVINGAPARTMENTS_MEDI: 0.857 LIVINGAREA_MODE - LIVINGAREA_MEDI: 0.975 LIVINGAREA_MODE - TOTALAREA_MODE: 0.899 NONLIVINGAPARTMENTS_MODE - NONLIVINGAPARTMENTS_MEDI: 0.979 NONLIVINGAREA_MODE - NONLIVINGAREA_MEDI: 0.976 APARTMENTS_MEDI - ELEVATORS_MEDI: 0.837 APARTMENTS_MEDI - LIVINGAPARTMENTS_MEDI: 0.942 APARTMENTS_MEDI - LIVINGAREA_MEDI: 0.916 APARTMENTS_MEDI - TOTALAREA_MODE: 0.887 ELEVATORS_MEDI - LIVINGAPARTMENTS_MEDI: 0.814 ELEVATORS_MEDI - LIVINGAREA_MEDI: 0.868 ELEVATORS_MEDI - TOTALAREA_MODE: 0.838 FLOORSMAX_MEDI - FLOORSMIN_MEDI: 0.740 LIVINGAPARTMENTS_MEDI - LIVINGAREA_MEDI: 0.885 LIVINGAPARTMENTS_MEDI - TOTALAREA_MODE: 0.846 LIVINGAREA_MEDI - TOTALAREA_MODE: 0.919 OBS_30_CNT_SOCIAL_CIRCLE - OBS_60_CNT_SOCIAL_CIRCLE: 0.998
In [35]:
### Pair plots for key variables
# Create pair plots for important variables
key_vars = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'YEARS_BIRTH', 'TARGET']
if all(var in app_data.columns for var in key_vars):
sns.pairplot(app_data[key_vars], hue='TARGET', diag_kind='hist')
plt.show()
In [36]:
## Bivariate Analysis - Numerical vs Target
# Box plots for numerical variables vs target
def plot_numerical_vs_target(df, numerical_cols, target_col='TARGET'):
plt.figure(figsize=(15, 20))
for i, col in enumerate(numerical_cols[:12]):
plt.subplot(4, 3, i+1)
# Box plot
df.boxplot(column=col, by=target_col, ax=plt.gca())
plt.title(f'{col} by Target')
plt.suptitle('') # Remove default title
plt.tight_layout()
plt.show()
plot_numerical_vs_target(app_data, numerical_cols[:8])
In [37]:
## Multivariate Analysis
### Three-way analysis
# Analyze relationships between multiple variables
def multivariate_analysis(df, cat_var1, cat_var2, target_col='TARGET'):
# Create crosstab
ct = pd.crosstab([df[cat_var1], df[cat_var2]], df[target_col])
# Calculate percentages
ct_pct = pd.crosstab([df[cat_var1], df[cat_var2]], df[target_col],
normalize='index') * 100
print(f"Multivariate Analysis: {cat_var1} x {cat_var2} x {target_col}")
print("Counts:")
print(ct)
print("\nPercentages:")
print(ct_pct)
# Visualization
plt.figure(figsize=(12, 6))
ct_pct.plot(kind='bar')
plt.title(f'{cat_var1} x {cat_var2} vs Target (%)')
plt.xticks(rotation=45)
plt.show()
# Example multivariate analysis
if 'NAME_CONTRACT_TYPE' in app_data.columns and 'CODE_GENDER' in app_data.columns:
multivariate_analysis(app_data, 'NAME_CONTRACT_TYPE', 'CODE_GENDER')
Multivariate Analysis: NAME_CONTRACT_TYPE x CODE_GENDER x TARGET
Counts:
TARGET 0 1
NAME_CONTRACT_TYPE CODE_GENDER
Cash loans F 169673 13127
M 85338 10094
Revolving loans F 18605 1043
M 9066 561
XNA 4 0
Percentages:
TARGET 0 1
NAME_CONTRACT_TYPE CODE_GENDER
Cash loans F 92.818928 7.181072
M 89.422835 10.577165
Revolving loans F 94.691572 5.308428
M 94.172639 5.827361
XNA 100.000000 0.000000
<Figure size 1200x600 with 0 Axes>
In [38]:
## Advanced Analysis - Target-wise Correlation
# Separate data by target values
target_0 = app_data[app_data['TARGET'] == 0]
target_1 = app_data[app_data['TARGET'] == 1]
# Correlation for non-defaulters (TARGET = 0)
plt.figure(figsize=(10, 8))
corr_0 = target_0[numerical_cols].select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_0, annot=True, cmap='Blues', center=0, fmt='.2f')
plt.title('Correlation Matrix - Non-Defaulters (TARGET = 0)')
plt.show()
# Correlation for defaulters (TARGET = 1)
plt.figure(figsize=(10, 8))
corr_1 = target_1[numerical_cols].select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_1, annot=True, cmap='Reds', center=0, fmt='.2f')
plt.title('Correlation Matrix - Defaulters (TARGET = 1)')
plt.show()
In [39]:
## Previous Application Analysis
# Load and analyze previous application data
print("Previous Application Analysis")
print("Shape:", prev_data.shape)
# Check missing values in previous data
prev_missing = (prev_data.isnull().sum() / len(prev_data)) * 100
prev_missing_df = prev_missing[prev_missing > 0].sort_values(ascending=False)
print("Missing values in Previous Application:")
print(prev_missing_df.head(10))
# Similar cleaning process for previous data
# Drop columns with >49% missing values
prev_threshold = 0.49
prev_high_missing = prev_missing_df[prev_missing_df > prev_threshold * 100].index
prev_data_clean = prev_data.drop(columns=prev_high_missing)
print(f"Previous data columns after cleaning: {len(prev_data_clean.columns)}")
Previous Application Analysis Shape: (1670214, 37) Missing values in Previous Application: RATE_INTEREST_PRIMARY 99.643698 RATE_INTEREST_PRIVILEGED 99.643698 AMT_DOWN_PAYMENT 53.636480 RATE_DOWN_PAYMENT 53.636480 NAME_TYPE_SUITE 49.119754 DAYS_FIRST_DRAWING 40.298129 DAYS_FIRST_DUE 40.298129 DAYS_LAST_DUE_1ST_VERSION 40.298129 DAYS_LAST_DUE 40.298129 DAYS_TERMINATION 40.298129 dtype: float64 Previous data columns after cleaning: 32
In [41]:
## Merge Analysis
# Merge application data with previous application data
# Group previous applications by SK_ID_CURR and create aggregated features
prev_agg = prev_data_clean.groupby('SK_ID_CURR').agg({
'AMT_ANNUITY': ['mean', 'max', 'min'],
'AMT_APPLICATION': ['mean', 'max', 'min'],
'AMT_CREDIT': ['mean', 'max', 'min'],
'AMT_GOODS_PRICE': ['mean', 'max', 'min']
}).reset_index()
# Flatten column names
prev_agg.columns = ['SK_ID_CURR'] + ['_'.join(col).strip() for col in prev_agg.columns[1:]]
# Merge with application data
merged_data = app_data.merge(prev_agg, on='SK_ID_CURR', how='left')
print(f"Merged data shape: {merged_data.shape}")
Merged data shape: (307511, 134)
In [43]:
merged_data = pd.merge(app_data, prev_data, on='SK_ID_CURR', how='left')
# Drop columns starting with 'FLAG'
cols_to_drop = [col for col in merged_data.columns if col.startswith('FLAG')]
merged_data.drop(columns=cols_to_drop,inplace=True)
In [44]:
print(merged_data.head(7))
SK_ID_CURR TARGET NAME_CONTRACT_TYPE_x CODE_GENDER CNT_CHILDREN \
0 100002 1 Cash loans M 0
1 100003 0 Cash loans F 0
2 100003 0 Cash loans F 0
3 100003 0 Cash loans F 0
4 100004 0 Revolving loans M 0
5 100006 0 Cash loans F 0
6 100006 0 Cash loans F 0
AMT_INCOME_TOTAL AMT_CREDIT_x AMT_ANNUITY_x AMT_GOODS_PRICE_x \
0 202500.0 406597.5 24700.5 351000.0
1 270000.0 1293502.5 35698.5 1129500.0
2 270000.0 1293502.5 35698.5 1129500.0
3 270000.0 1293502.5 35698.5 1129500.0
4 67500.0 135000.0 6750.0 135000.0
5 135000.0 312682.5 29686.5 297000.0
6 135000.0 312682.5 29686.5 297000.0
NAME_TYPE_SUITE_x NAME_INCOME_TYPE NAME_EDUCATION_TYPE \
0 Unaccompanied Working Secondary / secondary special
1 Family State servant Higher education
2 Family State servant Higher education
3 Family State servant Higher education
4 Unaccompanied Working Secondary / secondary special
5 Unaccompanied Working Secondary / secondary special
6 Unaccompanied Working Secondary / secondary special
NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE \
0 Single / not married House / apartment 0.018801
1 Married House / apartment 0.003541
2 Married House / apartment 0.003541
3 Married House / apartment 0.003541
4 Single / not married House / apartment 0.010032
5 Civil marriage House / apartment 0.008019
6 Civil marriage House / apartment 0.008019
DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE \
0 -9461 -637 -3648.0 -2120 NaN
1 -16765 -1188 -1186.0 -291 NaN
2 -16765 -1188 -1186.0 -291 NaN
3 -16765 -1188 -1186.0 -291 NaN
4 -19046 -225 -4260.0 -2531 26.0
5 -19005 -3039 -9833.0 -2437 NaN
6 -19005 -3039 -9833.0 -2437 NaN
OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT \
0 Laborers 1.0 2
1 Core staff 2.0 1
2 Core staff 2.0 1
3 Core staff 2.0 1
4 Laborers 1.0 2
5 Laborers 2.0 2
6 Laborers 2.0 2
REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START_x \
0 2 WEDNESDAY
1 1 MONDAY
2 1 MONDAY
3 1 MONDAY
4 2 MONDAY
5 2 WEDNESDAY
6 2 WEDNESDAY
HOUR_APPR_PROCESS_START_x REG_REGION_NOT_LIVE_REGION \
0 10 0
1 11 0
2 11 0
3 11 0
4 9 0
5 17 0
6 17 0
REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION \
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
5 0 0
6 0 0
REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY \
0 0 0 0
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0
6 0 0 0
ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 \
0 Business Entity Type 3 0.083037 0.262949 0.139376
1 School 0.311267 0.622246 NaN
2 School 0.311267 0.622246 NaN
3 School 0.311267 0.622246 NaN
4 Government NaN 0.555912 0.729567
5 Business Entity Type 3 NaN 0.650442 NaN
6 Business Entity Type 3 NaN 0.650442 NaN
APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG \
0 0.0247 0.0369 0.9722
1 0.0959 0.0529 0.9851
2 0.0959 0.0529 0.9851
3 0.0959 0.0529 0.9851
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG \
0 0.6192 0.0143 0.00 0.0690
1 0.7960 0.0605 0.08 0.0345
2 0.7960 0.0605 0.08 0.0345
3 0.7960 0.0605 0.08 0.0345
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG \
0 0.0833 0.1250 0.0369 0.0202
1 0.2917 0.3333 0.0130 0.0773
2 0.2917 0.3333 0.0130 0.0773
3 0.2917 0.3333 0.0130 0.0773
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG \
0 0.0190 0.0000 0.0000
1 0.0549 0.0039 0.0098
2 0.0549 0.0039 0.0098
3 0.0549 0.0039 0.0098
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE \
0 0.0252 0.0383 0.9722
1 0.0924 0.0538 0.9851
2 0.0924 0.0538 0.9851
3 0.0924 0.0538 0.9851
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE \
0 0.6341 0.0144 0.0000 0.0690
1 0.8040 0.0497 0.0806 0.0345
2 0.8040 0.0497 0.0806 0.0345
3 0.8040 0.0497 0.0806 0.0345
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE \
0 0.0833 0.1250 0.0377 0.022
1 0.2917 0.3333 0.0128 0.079
2 0.2917 0.3333 0.0128 0.079
3 0.2917 0.3333 0.0128 0.079
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE \
0 0.0198 0.0 0.0
1 0.0554 0.0 0.0
2 0.0554 0.0 0.0
3 0.0554 0.0 0.0
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI \
0 0.0250 0.0369 0.9722
1 0.0968 0.0529 0.9851
2 0.0968 0.0529 0.9851
3 0.0968 0.0529 0.9851
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI \
0 0.6243 0.0144 0.00 0.0690
1 0.7987 0.0608 0.08 0.0345
2 0.7987 0.0608 0.08 0.0345
3 0.7987 0.0608 0.08 0.0345
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI \
0 0.0833 0.1250 0.0375 0.0205
1 0.2917 0.3333 0.0132 0.0787
2 0.2917 0.3333 0.0132 0.0787
3 0.2917 0.3333 0.0132 0.0787
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI \
0 0.0193 0.0000 0.00
1 0.0558 0.0039 0.01
2 0.0558 0.0039 0.01
3 0.0558 0.0039 0.01
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE \
0 reg oper account block of flats 0.0149 Stone, brick
1 reg oper account block of flats 0.0714 Block
2 reg oper account block of flats 0.0714 Block
3 reg oper account block of flats 0.0714 Block
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE \
0 No 2.0 2.0
1 No 1.0 0.0
2 No 1.0 0.0
3 No 1.0 0.0
4 NaN 0.0 0.0
5 NaN 2.0 0.0
6 NaN 2.0 0.0
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE \
0 2.0 2.0 -1134.0
1 1.0 0.0 -828.0
2 1.0 0.0 -828.0
3 1.0 0.0 -828.0
4 0.0 0.0 -815.0
5 2.0 0.0 -617.0
6 2.0 0.0 -617.0
AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY \
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
5 NaN NaN
6 NaN NaN
AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON \
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
5 NaN NaN
6 NaN NaN
AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR SK_ID_PREV \
0 0.0 1.0 1038818.0
1 0.0 0.0 1810518.0
2 0.0 0.0 2636178.0
3 0.0 0.0 2396755.0
4 0.0 0.0 1564014.0
5 NaN NaN 2078043.0
6 NaN NaN 2827850.0
NAME_CONTRACT_TYPE_y AMT_ANNUITY_y AMT_APPLICATION AMT_CREDIT_y \
0 Consumer loans 9251.775 179055.0 179055.0
1 Cash loans 98356.995 900000.0 1035882.0
2 Consumer loans 64567.665 337500.0 348637.5
3 Consumer loans 6737.310 68809.5 68053.5
4 Consumer loans 5357.250 24282.0 20106.0
5 Cash loans 24246.000 675000.0 675000.0
6 Revolving loans NaN 0.0 0.0
AMT_DOWN_PAYMENT AMT_GOODS_PRICE_y WEEKDAY_APPR_PROCESS_START_y \
0 0.0 179055.0 SATURDAY
1 NaN 900000.0 FRIDAY
2 0.0 337500.0 SUNDAY
3 6885.0 68809.5 SATURDAY
4 4860.0 24282.0 FRIDAY
5 NaN 675000.0 THURSDAY
6 NaN NaN THURSDAY
HOUR_APPR_PROCESS_START_y NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT \
0 9.0 1.0 0.000000
1 12.0 1.0 NaN
2 17.0 1.0 0.000000
3 15.0 1.0 0.100061
4 5.0 1.0 0.212008
5 15.0 1.0 NaN
6 15.0 1.0 NaN
RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE \
0 NaN NaN XAP
1 NaN NaN XNA
2 NaN NaN XAP
3 NaN NaN XAP
4 NaN NaN XAP
5 NaN NaN XNA
6 NaN NaN XAP
NAME_CONTRACT_STATUS DAYS_DECISION NAME_PAYMENT_TYPE \
0 Approved -606.0 XNA
1 Approved -746.0 XNA
2 Approved -828.0 Cash through the bank
3 Approved -2341.0 Cash through the bank
4 Approved -815.0 Cash through the bank
5 Approved -181.0 Cash through the bank
6 Canceled -181.0 XNA
CODE_REJECT_REASON NAME_TYPE_SUITE_y NAME_CLIENT_TYPE NAME_GOODS_CATEGORY \
0 XAP NaN New Vehicles
1 XAP Unaccompanied Repeater XNA
2 XAP Family Refreshed Furniture
3 XAP Family Refreshed Consumer Electronics
4 XAP Unaccompanied New Mobile
5 XAP Unaccompanied Repeater XNA
6 XAP NaN Repeater XNA
NAME_PORTFOLIO NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA \
0 POS XNA Stone 500.0
1 Cash x-sell Credit and cash offices -1.0
2 POS XNA Stone 1400.0
3 POS XNA Country-wide 200.0
4 POS XNA Regional / Local 30.0
5 Cash x-sell Credit and cash offices -1.0
6 XNA XNA Credit and cash offices -1.0
NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP \
0 Auto technology 24.0 low_normal
1 XNA 12.0 low_normal
2 Furniture 6.0 middle
3 Consumer electronics 12.0 middle
4 Connectivity 4.0 middle
5 XNA 48.0 low_normal
6 XNA NaN XNA
PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE \
0 POS other with interest 365243.0 -565.0
1 Cash X-Sell: low 365243.0 -716.0
2 POS industry with interest 365243.0 -797.0
3 POS household with interest 365243.0 -2310.0
4 POS mobile without interest 365243.0 -784.0
5 Cash X-Sell: low 365243.0 -151.0
6 Card Street NaN NaN
DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION \
0 125.0 -25.0 -17.0
1 -386.0 -536.0 -527.0
2 -647.0 -647.0 -639.0
3 -1980.0 -1980.0 -1976.0
4 -694.0 -724.0 -714.0
5 1259.0 -151.0 -143.0
6 NaN NaN NaN
NFLAG_INSURED_ON_APPROVAL
0 0.0
1 1.0
2 0.0
3 1.0
4 0.0
5 0.0
6 NaN
In [46]:
## Key Insights and Conclusions
# Generate summary insights
def generate_insights(df, target_col='TARGET'):
insights = []
# Target distribution
target_dist = df[target_col].value_counts(normalize=True) * 100
insights.append(f"Target Distribution: {target_dist[0]:.1f}% Non-defaulters, {target_dist[1]:.1f}% Defaulters")
# Gender analysis
if 'CODE_GENDER' in df.columns:
gender_default = pd.crosstab(df['CODE_GENDER'], df[target_col], normalize='index') * 100
insights.append(f"Gender Analysis: Female default rate: {gender_default.loc['F', 1]:.1f}%, Male default rate: {gender_default.loc['M', 1]:.1f}%")
# Age group analysis
if 'AGE_GROUP' in df.columns:
age_default = pd.crosstab(df['AGE_GROUP'], df[target_col], normalize='index') * 100
insights.append("Age Group Default Rates:")
for age_group in age_default.index:
insights.append(f" {age_group}: {age_default.loc[age_group, 1]:.1f}%")
return insights
insights = generate_insights(app_data)
for insight in insights:
print(insight)
Target Distribution: 91.9% Non-defaulters, 8.1% Defaulters Gender Analysis: Female default rate: 7.0%, Male default rate: 10.1%
In [ ]: